# -*- coding: utf-8 -*-
# coding:unicode_escape
import requests as rq 
from bs4 import BeautifulSoup
from lxml import etree
import pandas as pd
import json

def all_paras():
    """Build the default request configuration for the catalogue search.

    Returns:
        A ``(parameters, headers, firstUrl)`` tuple: a freshly built dict of
        query parameters (with an empty ``"request"`` value for the caller to
        fill in), a dict of HTTP headers carrying a browser User-Agent, and
        the entry URL string.
    """
    query = {
        "func": "find-b",
        "find_code": "ISB",
        "request": "",
        "local_base": "",
    }
    # Five numbered filter slots, each with a code and an empty request value.
    filter_codes = ("WLN", "WYR", "WYR", "WFM", "WSL")
    for slot, code in enumerate(filter_codes, start=1):
        query["filter_code_{}".format(slot)] = code
        query["filter_request_{}".format(slot)] = ""

    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.66"
    )
    return query, {"User-Agent": user_agent}, "http://opac.nlc.cn/F"


def rawlink_handler(rawlink):
    """Return the link prefix with its five-character counter decremented.

    The first 75 characters of ``rawlink`` are kept unchanged, characters
    75-79 are parsed as an integer, and that integer minus one is appended
    again, left-padded with zeros to at least five characters. Anything
    after position 80 is dropped.

    Raises:
        ValueError: If characters 75-79 cannot be parsed as an integer.
    """
    prefix, counter = rawlink[:75], rawlink[75:80]
    previous = str(int(counter) - 1)
    return prefix + previous.rjust(5, "0")

def get_emlemnt(headers, parameters, firstUrl, timeout=10):
    """Run the catalogue search and return the text of the result cells.

    Three HTTP GET requests are made in sequence:

    1. ``firstUrl`` is fetched and the ``href`` of its first
       ``<a class="gblue1">`` link is passed through ``rawlink_handler``.
    2. The resulting URL is fetched with ``headers`` and ``parameters`` and
       the ``action`` attribute of its first ``<form>`` is read.
    3. That action URL is fetched with ``headers`` and ``parameters``; the
       response is decoded as UTF-8 and parsed.

    Args:
        headers: HTTP headers sent with the second and third requests.
        parameters: Query parameters sent with the second and third requests.
        firstUrl: Entry URL fetched first (without custom headers).
        timeout: Seconds to wait on each request. Without a timeout
            ``requests`` may block indefinitely on an unresponsive server.

    Returns:
        A list with one entry per ``<td class="td1">`` cell of the final
        page. Each entry is the cell's ``.string`` (or that of the first
        ``<a>`` tag inside the cell when there is one), stripped of
        surrounding whitespace and with non-breaking spaces removed, or
        ``None`` when the element has no single string.

    Raises:
        AttributeError: If the expected link or form is missing from a
            fetched page.
    """
    # Step 1: derive the first key URL (getUrl) from the entry page.
    entry_page = rq.get(firstUrl, timeout=timeout)
    entry_soup = BeautifulSoup(entry_page.text, 'html.parser')
    anchor = entry_soup.find(name='a', class_='gblue1', attrs={'href': True})
    getUrl = rawlink_handler(anchor.get('href'))

    # Step 2: the page behind getUrl holds the form whose action is the URL
    # the query is finally submitted to.
    form_page = rq.get(getUrl, headers=headers, params=parameters,
                       timeout=timeout)
    form_soup = BeautifulSoup(form_page.text, 'html.parser')
    href = form_soup.find("form").get("action")

    # Step 3: fetch the target page. Per the original author's note it lists
    # the publication info, the content summary and the CLC number.
    result_page = rq.get(href, headers=headers, params=parameters,
                         timeout=timeout)
    result_page.encoding = "utf-8"
    result_soup = BeautifulSoup(result_page.text, 'html.parser')

    list_td = []
    for td in result_soup.find_all("td", class_="td1"):
        # Some values are wrapped in an <a> tag inside the cell.
        holder = td.a if td.a is not None else td
        text = holder.string
        if text is not None:
            text = text.strip().replace("\xa0", "")
        list_td.append(text)
    return list_td

def tojson(list_td):
    """Serialize the classification number found in ``list_td`` as JSON.

    The list is scanned for the label ``"中图分类号"``; the element directly
    after it is taken as its value. When the label occurs several times the
    last occurrence wins. A label that is the final element has no value and
    is ignored rather than raising ``IndexError``.

    Args:
        list_td: List of cell texts (strings or ``None``).

    Returns:
        UTF-8 encoded bytes of a JSON object that maps the label to its
        value, or of an empty object when no labelled value was found.
    """
    label = "中图分类号"
    dic = {}
    # Pair each element with its successor so a trailing label is skipped.
    for text, following in zip(list_td, list_td[1:]):
        if text == label:
            dic[label] = following
    return json.dumps(dic, ensure_ascii=False).encode("utf-8")

def toresult(list_td):
    """Return the element that follows the first ``"中图分类号"`` label.

    Args:
        list_td: List of cell texts (strings or ``None``).

    Returns:
        The element directly after the first occurrence of the label, or
        ``None`` when the label is absent or is the last element (in which
        case there is no value to return).
    """
    label = "中图分类号"
    # Pair each element with its successor so a trailing label cannot
    # index past the end of the list.
    for text, following in zip(list_td, list_td[1:]):
        if text == label:
            return following
    return None



def run(isbn):
    """Look up ``isbn`` in the catalogue and return its classification number.

    The default request configuration from ``all_paras`` is used with its
    ``"request"`` parameter set to ``isbn``; the cells fetched by
    ``get_emlemnt`` are then reduced by ``toresult``.

    Returns:
        Whatever ``toresult`` returns for the fetched cells (``None`` when
        no classification label is found).
    """
    query, request_headers, entry_url = all_paras()
    query["request"] = isbn
    return toresult(get_emlemnt(request_headers, query, entry_url))


if __name__ == "__main__":
    # Script entry point: look up one sample ISBN. The result is discarded.
    run("9787115291578")








